# Load the Portuguese subset (5 provenances x 4 blocks) and prepare it for modelling.
# mutate_at() is superseded; use across() instead.
data <- readRDS(file="../data/sub_portugal_5provs_4blocks.rds") %>%
mutate(across(all_of(c("block", "prov", "clon", "tree")), as.factor)) %>% # design variables as factors
mutate(age.sc = as.vector(scale(age, center = FALSE))) # as age is defined on R+ I would only rescale it (no centering)
# mutate(age.sc = as.vector(scale(age))) # alternative: mean-centered age
The dataset includes:
# Cross-tabulate provenance by block.
kable(table(data$prov, data$block), caption = "Provenance against block number.")
| | 34 | 35 | 36 | 38 |
|---|---|---|---|---|
| LEI | 78 | 71 | 72 | 82 |
| MIM | 44 | 45 | 60 | 60 |
| PIE | 26 | 29 | 34 | 34 |
| SAC | 21 | 20 | 18 | 21 |
| VAL | 37 | 43 | 42 | 42 |
# Cross-tabulate provenance by measurement age (in months).
kable(table(data$prov, as.factor(data$age)), caption = "Provenance against age (in months).")
| | 11 | 15 | 20 | 27 |
|---|---|---|---|---|
| LEI | 92 | 84 | 65 | 62 |
| MIM | 72 | 59 | 40 | 38 |
| PIE | 36 | 33 | 27 | 27 |
| SAC | 23 | 23 | 17 | 17 |
| VAL | 48 | 44 | 37 | 35 |
# Overall distribution of tree height.
height_hist <- ggplot(data, aes(x = height)) +
  geom_histogram(color = "darkblue", fill = "lightblue") +
  theme_bw()
height_hist
Height versus age.
# Height distribution at each measurement age (one facet per age).
height_by_age <- ggplot(data, aes(x = height, color = as.factor(age))) +
  geom_histogram(fill = "white", alpha = 0.5, position = "identity") +
  facet_wrap(~as.factor(age)) +
  theme_bw() +
  theme(legend.position = "none")
height_by_age
Height versus age.
# Height against age: linear smooth (left) vs quadratic smooth (right).
# Both panels share the same base scatter plot.
height_age_base <- ggplot(data, aes(x = age, y = height)) +
  geom_point(alpha = 0.2) +
  theme_bw() +
  theme(axis.title = element_text(size = 16))
plot_grid(height_age_base + stat_smooth(method = "lm", col = "red"),
          height_age_base + stat_smooth(method = "lm", col = "red", formula = y ~ poly(x, 2)))
Height versus age.
# log(height) against age: linear smooth (left) vs quadratic smooth (right).
log_height_base <- ggplot(data, aes(x = age, y = log(height))) +
  geom_point(alpha = 0.2) +
  theme_bw() +
  theme(axis.title = element_text(size = 16))
plot_grid(log_height_base + stat_smooth(method = "lm", col = "red"),
          log_height_base + stat_smooth(method = "lm", col = "red", formula = y ~ poly(x, 2)))
Height versus age.
# Height distributions by provenance (rows) and block (columns).
prov_block_hist <- ggplot(data, aes(x = height, color = block)) +
  geom_histogram(fill = "white", alpha = 0.5, position = "identity") +
  facet_grid(prov ~ block, labeller = label_both) +
  theme_bw() +
  theme(legend.position = "none")
prov_block_hist
Height distribution by Provenance and Block.
Dummy variables for each level = Regularized intercepts, because we use weakly informative priors. But no information shared between intercepts. (See P299 in Statistical Rethinking of McElreath)
First, we are going to try three different likelihoods: the normal distribution, the normal distribution with a log-transformed response variable and a lognormal distribution.
\[\begin{equation} \begin{aligned} h_{i} & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{HalfCauchy}(0,25) \end{aligned} \end{equation}\]Mathematical model
Data in a list.
# Data for Stan, model mN. nrow()/nlevels() are clearer than
# length(data$height)/length(unique(...)); the factors were just created
# from the data, so their levels are exactly the observed values.
data.list <- list(N = nrow(data),                 # Number of observations
                  y = data$height,                # Response variable
                  age = data$age.sc,              # Tree age (scaled)
                  nprov = nlevels(data$prov),     # Number of provenances
                  nblock = nlevels(data$block),   # Number of blocks
                  prov = as.numeric(data$prov),   # Provenance index per tree
                  bloc = as.numeric(data$block))  # Block index per tree
# Compile and fit mN: normal likelihood on raw height
# (2 chains in parallel, 1000 warmup + 1000 kept iterations each).
mN = stan_model("mN.stan")
fit.mN <- sampling(mN, data = data.list, iter = 2000, chains = 2, cores = 2)
# Posterior summary (medians, 95% CI, R-hat, ESS) for model mN.
# Use TRUE instead of the reassignable shorthand T.
broom::tidyMCMC(fit.mN,
                pars = c("beta_age","beta_age2", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mN with a normal distribution")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 137.13915 | 7.923203 | 121.877254 | 153.31531 | 0.9991405 | 2163 |
| beta_age2 | 151.45624 | 6.176576 | 139.145932 | 163.13511 | 1.0003495 | 2356 |
| alpha_prov[1] | 45.85882 | 7.330742 | 31.699813 | 60.15782 | 1.0001671 | 2286 |
| alpha_prov[2] | 29.91831 | 7.273491 | 16.116101 | 44.65727 | 0.9996011 | 2554 |
| alpha_prov[3] | 11.93355 | 8.055918 | -3.762625 | 28.12042 | 0.9997846 | 2565 |
| alpha_prov[4] | 20.06939 | 8.698003 | 3.141126 | 37.19453 | 1.0009620 | 2978 |
| alpha_prov[5] | 25.40380 | 8.021383 | 10.039115 | 41.70351 | 0.9991908 | 2433 |
| alpha_block[1] | 21.62438 | 7.732563 | 6.777951 | 36.80888 | 1.0007131 | 2861 |
| alpha_block[2] | 26.49717 | 7.669491 | 11.904573 | 41.58097 | 0.9996315 | 2385 |
| alpha_block[3] | 35.18395 | 7.689631 | 20.033232 | 50.79736 | 0.9998053 | 2609 |
| alpha_block[4] | 49.71694 | 7.681219 | 34.859813 | 64.14071 | 0.9993917 | 2420 |
| sigma_y | 141.45494 | 3.679044 | 134.601916 | 148.91160 | 0.9990337 | 3625 |
| lp__ | -5048.54238 | 2.411856 | -5054.131529 | -5044.93354 | 0.9999696 | 1008 |
Comparison of the distribution of \(y\) and the posterior predictive distributions (from the `y_rep` matrix).
# Posterior predictive check for mN: observed heights vs 50 posterior draws.
yrep_mN <- as.matrix(fit.mN, pars = "y_rep")[1:50, ]
ppc_dens_overlay(y = data$height, yrep_mN) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
“In the plot above, the dark line is the distribution of the observed outcomes \(y\) and each of the 50 lighter lines is the kernel density estimate of one of the replications of \(y\) from the posterior predictive distribution (i.e., one of the rows in yrep).” From Graphical posterior predictive checks using the bayesplot package.
This plot makes it easy to see that this model poorly fits the data. We can probably do better…
\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{HalfCauchy}(0,25) \end{aligned} \end{equation}\]Mathematical model
# Data for Stan, model mNlogy: same as data.list but with a log-transformed response.
data.list.mNlogy <- list(N = nrow(data),               # Number of observations
                         y = log(data$height),         # log(response variable)
                         age = data$age.sc,            # Tree age (scaled)
                         nprov = nlevels(data$prov),   # Number of provenances
                         nblock = nlevels(data$block), # Number of blocks
                         prov = as.numeric(data$prov), # Provenance index per tree
                         bloc = as.numeric(data$block))# Block index per tree
Same stan code as the previous model!
# Compile and fit mNlogy: normal likelihood on log(height).
# max_treedepth is raised to 14 to avoid treedepth warnings; warmup draws are discarded.
mNlogy = stan_model("mNlogy.stan")
fit.mNlogy <- sampling(mNlogy, data = data.list.mNlogy, iter = 2000, chains = 2,
control=list(max_treedepth=14), cores = 2, save_warmup = F)
# Posterior summary (medians, 95% CI, R-hat, ESS) for model mNlogy.
broom::tidyMCMC(fit.mNlogy,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogy with a normal distribution and a log-transformed response variable")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 3.0579301 | 0.3350220 | 2.4019656 | 3.7218651 | 1.001877 | 561 |
| beta_age2 | -0.8536282 | 0.1599965 | -1.1683883 | -0.5375173 | 1.001921 | 561 |
| alpha_prov[1] | 1.6513564 | 3.4485964 | -5.8734695 | 7.8363884 | 1.017059 | 226 |
| alpha_prov[2] | 1.6023959 | 3.4494309 | -5.9057697 | 7.7596777 | 1.016993 | 226 |
| alpha_prov[3] | 1.4990742 | 3.4479495 | -6.0281759 | 7.6769490 | 1.017075 | 225 |
| alpha_prov[4] | 1.6128414 | 3.4481695 | -5.9498839 | 7.7709924 | 1.017021 | 226 |
| alpha_prov[5] | 1.6099099 | 3.4488907 | -5.9272419 | 7.7592365 | 1.017103 | 226 |
| alpha_block[1] | 2.0575301 | 3.4408496 | -4.1398833 | 9.5821414 | 1.017544 | 225 |
| alpha_block[2] | 2.0840563 | 3.4407266 | -4.1017959 | 9.6248734 | 1.017555 | 225 |
| alpha_block[3] | 2.1086391 | 3.4405669 | -4.0651311 | 9.6285910 | 1.017572 | 225 |
| alpha_block[4] | 2.2099892 | 3.4412635 | -3.9687104 | 9.7504930 | 1.017570 | 225 |
| sigma_y | 0.3958537 | 0.0099854 | 0.3777618 | 0.4162877 | 1.005812 | 531 |
| lp__ | 372.5545003 | 2.4294536 | 366.5960456 | 375.9544738 | 1.000135 | 514 |
There may be a warning about effective sample size. This warning should disappear if we increase the number of iterations to 2,500.
We have to increase the maximum treedepth: See the Brief Guide to Stan’s Warnings
The log-posterior (lp__) has largely increased.
Comparison of the distribution of \(y\) and the posterior predictive distributions (from the `y_rep` matrix).
# Posterior predictive check for mNlogy, on the log scale.
# Reuse the extracted y_rep matrix instead of extracting it a second time.
y_rep <- as.matrix(fit.mNlogy, pars = "y_rep")
ppc_dens_overlay(y = log(data$height),
                 y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))
A better fit than mN.
\[\begin{equation} \begin{aligned} h_{i} & \sim \text{LogNormal}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{HalfCauchy}(0,25) \end{aligned} \end{equation}\]Mathematical model
We are going to use the same data list as in the first model mN.
# Compile and fit mLogNR: LogNormal likelihood on raw height.
# Run the two chains in parallel (cores = 2) for consistency with the other fits,
# and write FALSE rather than the reassignable shorthand F.
mLogNR = stan_model("mLogNR.stan")
fit.mLogNR <- sampling(mLogNR, data = data.list, iter = 2000,
                       chains = 2, cores = 2,
                       control = list(max_treedepth = 14), save_warmup = FALSE)
# Posterior summary (medians, 95% CI, R-hat, ESS) for model mLogNR.
broom::tidyMCMC(fit.mLogNR,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mLogNR with a LogNormal distribution on R")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 3.0555719 | 0.3275468 | 2.3713162 | 3.6605593 | 0.9992367 | 634 |
| beta_age2 | -0.8543036 | 0.1573536 | -1.1465293 | -0.5354714 | 0.9992856 | 628 |
| alpha_prov[1] | 2.1217546 | 3.4084211 | -4.3739902 | 8.6881716 | 1.0131870 | 265 |
| alpha_prov[2] | 2.0607284 | 3.4087468 | -4.4315566 | 8.6440914 | 1.0131141 | 265 |
| alpha_prov[3] | 1.9586754 | 3.4103368 | -4.5475392 | 8.5584074 | 1.0130917 | 265 |
| alpha_prov[4] | 2.0703715 | 3.4066068 | -4.4759199 | 8.6313427 | 1.0131313 | 265 |
| alpha_prov[5] | 2.0709604 | 3.4086466 | -4.4550309 | 8.6671989 | 1.0131089 | 265 |
| alpha_block[1] | 1.6380233 | 3.3995181 | -4.9401125 | 8.1192378 | 1.0134022 | 265 |
| alpha_block[2] | 1.6845306 | 3.4004451 | -4.9253197 | 8.1341568 | 1.0133324 | 265 |
| alpha_block[3] | 1.6904876 | 3.3992693 | -4.9123654 | 8.1533350 | 1.0133388 | 265 |
| alpha_block[4] | 1.7968742 | 3.4003061 | -4.8303802 | 8.2700915 | 1.0133413 | 265 |
| sigma_y | 0.3967370 | 0.0094583 | 0.3804775 | 0.4175419 | 1.0028582 | 508 |
| lp__ | 372.6176314 | 2.4261156 | 366.8486216 | 376.1194057 | 1.0031655 | 542 |
Comparison of the distribution of \(y\) and the posterior predictive distributions (from the `y_rep` matrix).
# Posterior predictive check for mLogNR, on the raw height scale.
yrep_mLogNR <- as.matrix(fit.mLogNR, pars = "y_rep")[1:50, ]
ppc_dens_overlay(y = data$height, yrep_mLogNR) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))
In the previous models, our priors are very weakly informative. We can use slightly more informative priors, which can help convergence and decrease the running time. Below, we refit the model mNlogy (normal distribution with log(y)) with more informative priors.
Variance priors
Prior recommendations for scale parameters in hierarchical models
Very weakly informative prior: \(\sigma \sim \text{HalfCauchy}(0,25)\). From Gelman (2006): 8-schools example (p430). And here.
Weakly informative prior:
\(\sigma \sim \text{HalfCauchy}(0,1)\) (McElreath, First version) \(\sigma \sim \text{HalfCauchy}(0,5)\) (Betancourt in 8-schools example)
\(\sigma \sim \text{exponential}(1)\) (McElreath, Second version) or \(\sigma \sim \text{exponential}(0.1)\)
\(\sigma \sim \text{LogNormal}(0,1)\) (McElreath, Second version)
More informative prior : \(\sigma \sim \text{HalfNormal}(0,1)\) or \(\sigma \sim \text{Half-t}(3,0,1)\)
Beta priors
More informative priors: \(\beta \sim \mathcal{N}(0,1)\).
We can also consider that \(age\) is positively associated with height so we can add some information in our model and constrain \(\beta_{age}\) to positive values, such as: \(\beta_{age} \sim \text{LogNormal}(0,1)\) or \(\beta_{age} \sim \text{Gamma}(?,?)\).
\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]Mathematical model
# Compile and fit mNlogySigmaPrior: same as mNlogy but with an
# Exponential(1) prior on sigma instead of HalfCauchy(0,25).
mNlogySigmaPrior = stan_model("mNlogySigmaPrior.stan")
fit.mNlogySigmaPrior <- sampling(mNlogySigmaPrior, data = data.list.mNlogy, iter = 2000, chains = 2, cores = 2,
control=list(max_treedepth=14), save_warmup = F)
# Posterior summary (medians, 95% CI, R-hat, ESS) for model mNlogySigmaPrior.
broom::tidyMCMC(fit.mNlogySigmaPrior,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogySigmaPrior with a normal distribution and a log-transformed response variable")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 3.0107288 | 0.3388130 | 2.3715193 | 3.6906658 | 1.0002014 | 679 |
| beta_age2 | -0.8314108 | 0.1619515 | -1.1562546 | -0.5276815 | 1.0002880 | 678 |
| alpha_prov[1] | 1.1708538 | 3.1865215 | -5.0398797 | 7.5622102 | 1.0143187 | 222 |
| alpha_prov[2] | 1.1293683 | 3.1867863 | -5.1017683 | 7.4990827 | 1.0142846 | 222 |
| alpha_prov[3] | 1.0029820 | 3.1870150 | -5.1622819 | 7.3811998 | 1.0143557 | 222 |
| alpha_prov[4] | 1.1032702 | 3.1862411 | -5.0818598 | 7.4614433 | 1.0144236 | 222 |
| alpha_prov[5] | 1.1352337 | 3.1864730 | -5.0916072 | 7.5086448 | 1.0142397 | 222 |
| alpha_block[1] | 2.5951974 | 3.1985722 | -3.9152278 | 8.8790141 | 1.0143058 | 222 |
| alpha_block[2] | 2.6249875 | 3.1987471 | -3.8725381 | 8.9153136 | 1.0143092 | 222 |
| alpha_block[3] | 2.6452764 | 3.1990859 | -3.8989140 | 8.9462457 | 1.0142977 | 222 |
| alpha_block[4] | 2.7457830 | 3.1988621 | -3.7875396 | 9.0413565 | 1.0142977 | 222 |
| sigma_y | 0.3956953 | 0.0096858 | 0.3783233 | 0.4158609 | 1.0039343 | 690 |
| lp__ | 372.2541210 | 2.4180218 | 366.3082466 | 375.5637982 | 0.9993408 | 678 |
Comparison of the distribution of \(y\) and the posterior predictive distributions (from the `y_rep` matrix).
# Posterior predictive check for mNlogySigmaPrior, on the log scale.
yrep_sigma_prior <- as.matrix(fit.mNlogySigmaPrior, pars = "y_rep")[1:50, ]
ppc_dens_overlay(y = log(data$height), yrep_sigma_prior) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))
\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,1)\\ \alpha_{PROV} & \sim \mathcal{N}(0,1)\\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]Mathematical model
# Compile and fit mNlogyBetaAlphaPrior: N(0,1) priors on the betas and
# the intercepts, Exponential(1) on sigma.
mNlogyBetaAlphaPrior = stan_model("mNlogyBetaAlphaPrior.stan")
fit.mNlogyBetaAlphaPrior <- sampling(mNlogyBetaAlphaPrior, data = data.list.mNlogy, iter = 2000, chains = 2, cores = 2,
control=list(max_treedepth=14), save_warmup = F)
# Posterior summary (medians, 95% CI, R-hat, ESS) for model mNlogyBetaAlphaPrior.
broom::tidyMCMC(fit.mNlogyBetaAlphaPrior,
                pars = c("beta_age", "beta_age2", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogyBetaAlphaPrior with a normal distribution and a log-transformed response variable")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 3.0868389 | 0.3016956 | 2.4850452 | 3.6804960 | 1.000580 | 727 |
| beta_age2 | -0.8649354 | 0.1445793 | -1.1439516 | -0.5865007 | 1.000900 | 722 |
| alpha_prov[1] | 1.6835563 | 0.3265511 | 1.0970084 | 2.3686384 | 1.003928 | 412 |
| alpha_prov[2] | 1.6324825 | 0.3275679 | 1.0234395 | 2.3139650 | 1.003899 | 417 |
| alpha_prov[3] | 1.5305514 | 0.3281026 | 0.9259619 | 2.2072922 | 1.004010 | 421 |
| alpha_prov[4] | 1.6409676 | 0.3279010 | 1.0279154 | 2.3152684 | 1.003384 | 418 |
| alpha_prov[5] | 1.6390326 | 0.3272003 | 1.0523706 | 2.3203585 | 1.004124 | 417 |
| alpha_block[1] | 2.0271343 | 0.3357225 | 1.3103509 | 2.6627104 | 1.004638 | 428 |
| alpha_block[2] | 2.0561796 | 0.3351832 | 1.3451784 | 2.6817137 | 1.004626 | 433 |
| alpha_block[3] | 2.0701832 | 0.3355333 | 1.3652459 | 2.6955298 | 1.004339 | 433 |
| alpha_block[4] | 2.1746311 | 0.3363378 | 1.4604000 | 2.8231571 | 1.004512 | 432 |
| sigma_y | 0.3963958 | 0.0091220 | 0.3797049 | 0.4147040 | 1.000697 | 1081 |
| lp__ | 352.2499899 | 2.4029203 | 346.4175783 | 355.6635881 | 1.001058 | 584 |
Comparison of the distribution of \(y\) and the posterior predictive distributions (from the `y_rep` matrix).
# Posterior predictive check for mNlogyBetaAlphaPrior, on the log scale.
yrep_beta_alpha <- as.matrix(fit.mNlogyBetaAlphaPrior, pars = "y_rep")[1:50, ]
ppc_dens_overlay(y = log(data$height), yrep_beta_alpha) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))
Comment: The intercept parameters change between the two models (mNlogyBetaAlphaPrior and mNlogySigmaPrior).
We use again the model mNlogy (normal distribution with log(y) and very weakly informative priors)
\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(\mu_{i},\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{BLOCK[b]} + \alpha_{PROV[p]}\\ \alpha_{BLOCK} & \sim \mathcal{N}(0,10)\\ \alpha_{PROV} & \sim \mathcal{N}(0,10)\\ \beta_{age} & \sim \mathcal{N}(0,10) \\ \beta_{age2} & \sim \mathcal{N}(0,10)\\ \sigma & \sim \text{HalfCauchy}(0,25) \end{aligned} \end{equation}\]Mathematical model
# Compile and fit mNlogyVectorize1: vectorized version of the mNlogy Stan code.
mNlogyVectorize1 = stan_model("mNlogyVectorize1.stan")
fit.mNlogyVectorize1 <- sampling(mNlogyVectorize1, data = data.list.mNlogy, iter = 2000,
control=list(max_treedepth=14),
chains = 2, cores = 2, save_warmup = F)
# Posterior summary (medians, 95% CI, R-hat, ESS) for model mNlogyVectorize1.
broom::tidyMCMC(fit.mNlogyVectorize1,
                pars = c("beta_age", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogyVectorize1 with a normal distribution and log(y)")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 3.0295758 | 0.3437384 | 2.3854121 | 3.7243783 | 1.0025844 | 517 |
| alpha_prov[1] | 1.7320294 | 3.4026410 | -4.7045217 | 8.3893157 | 1.0063156 | 320 |
| alpha_prov[2] | 1.6830447 | 3.4025796 | -4.7571554 | 8.3275703 | 1.0063898 | 320 |
| alpha_prov[3] | 1.5823837 | 3.4036095 | -4.9119571 | 8.2331163 | 1.0064110 | 320 |
| alpha_prov[4] | 1.6850773 | 3.4019339 | -4.7694287 | 8.3539318 | 1.0063527 | 320 |
| alpha_prov[5] | 1.6921565 | 3.4023780 | -4.7398002 | 8.3212315 | 1.0063940 | 320 |
| alpha_block[1] | 1.9698698 | 3.4082661 | -4.5266003 | 8.5171252 | 1.0065503 | 319 |
| alpha_block[2] | 2.0003218 | 3.4086478 | -4.5091504 | 8.4954238 | 1.0065544 | 319 |
| alpha_block[3] | 2.0176696 | 3.4082629 | -4.4820293 | 8.5405708 | 1.0065507 | 319 |
| alpha_block[4] | 2.1158136 | 3.4083847 | -4.4289952 | 8.6266011 | 1.0066272 | 319 |
| sigma_y | 0.3966256 | 0.0095206 | 0.3782708 | 0.4152456 | 1.0000090 | 896 |
| lp__ | 372.6274861 | 2.3729074 | 366.7477482 | 376.0156133 | 0.9993589 | 705 |
We have to increase max_treedepth here!
mNlogyVectorize2 = stan_model("mNlogyVectorize2.stan")
# Fit mNlogyVectorize2: alternative vectorized parameterization of mNlogy.
fit.mNlogyVectorize2 <- sampling(mNlogyVectorize2, data = data.list.mNlogy, iter = 2000,
control=list(max_treedepth=14),
chains = 2, cores = 2, save_warmup = F)
## Warning: The largest R-hat is 1.06, indicating chains have not mixed.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#r-hat
## Warning: Bulk Effective Samples Size (ESS) is too low, indicating posterior means and medians may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#bulk-ess
# Posterior summary (medians, 95% CI, R-hat, ESS) for model mNlogyVectorize2.
broom::tidyMCMC(fit.mNlogyVectorize2,
                pars = c("beta_age", "alpha_prov", "alpha_block", "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "Model mNlogyVectorize2 with a normal distribution and log(y)")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 0.4091226 | 0.0138510 | 0.3821943 | 0.4360379 | 0.9999335 | 838 |
| alpha_prov[1] | 1.8450212 | 3.2578420 | -4.5137258 | 8.3510905 | 1.0605418 | 78 |
| alpha_prov[2] | 1.7888478 | 3.2562462 | -4.5845030 | 8.2720471 | 1.0606634 | 77 |
| alpha_prov[3] | 1.6946440 | 3.2581851 | -4.6910023 | 8.2088985 | 1.0604755 | 78 |
| alpha_prov[4] | 1.8247183 | 3.2568439 | -4.5790838 | 8.3067101 | 1.0605361 | 78 |
| alpha_prov[5] | 1.8145836 | 3.2562120 | -4.5532209 | 8.2905805 | 1.0604501 | 78 |
| alpha_block[1] | 3.1120838 | 3.2552218 | -3.3770905 | 9.4613277 | 1.0604816 | 78 |
| alpha_block[2] | 3.1419801 | 3.2548740 | -3.3577467 | 9.4848990 | 1.0606863 | 78 |
| alpha_block[3] | 3.1441316 | 3.2547984 | -3.3183893 | 9.5163807 | 1.0607813 | 77 |
| alpha_block[4] | 3.2769549 | 3.2562493 | -3.2338038 | 9.5880708 | 1.0605484 | 78 |
| sigma_y | 0.4094675 | 0.0095676 | 0.3926185 | 0.4299091 | 1.0026968 | 1152 |
| lp__ | -505.5094293 | 2.4034737 | -510.7965897 | -501.7509199 | 1.0001844 | 614 |
Why does this model have a negative lp__? It is also longer to run, with a low number of effective samples. Why?
Let’s compare the speed of the 5 models with a normal distribution with log(y)
# Compare per-chain elapsed times (warmup + sampling) across the five log(y)
# normal models. Collect the S4 stanfit objects with list(), not c(): c() is
# not defined for combining S4 objects and list() is the intended container.
lapply(lapply(list(fit.mNlogy, fit.mNlogySigmaPrior, fit.mNlogyBetaAlphaPrior,
                   fit.mNlogyVectorize1, fit.mNlogyVectorize2),
              get_elapsed_time), data.frame) %>%
  bind_rows(.id = "model") %>%
  mutate(model = recode_factor(model,
                               "1" = "Model mNlogy with $\\sigma \\sim \\text{HalfCauchy}(0,25)$",
                               "2" = "Model mNlogySigmaPrior with $\\sigma \\sim \\text{Exponential}(1)$",
                               "3" = "Model mNlogyBetaAlphaPrior with $\\beta$ and $\\alpha \\sim \\mathcal{N}(0,1)$ ",
                               "4" = "Model mNlogyVectorize1",
                               "5" = "Model mNlogyVectorize2")) %>%
  mutate(total = warmup + sample) %>%   # total runtime per chain
  arrange(total) %>%
  kable(caption = "Model speed comparison of models with a normal distribution and a log-transformed response variable.")
| model | warmup | sample | total |
|---|---|---|---|
| Model mNlogyBetaAlphaPrior with \(\beta\) and \(\alpha \sim \mathcal{N}(0,1)\) | 17.4769 | 19.8944 | 37.3713 |
| Model mNlogyBetaAlphaPrior with \(\beta\) and \(\alpha \sim \mathcal{N}(0,1)\) | 16.9092 | 23.2911 | 40.2003 |
| Model mNlogyVectorize2 | 27.6535 | 31.0206 | 58.6741 |
| Model mNlogyVectorize1 | 26.9988 | 31.8640 | 58.8628 |
| Model mNlogyVectorize1 | 28.8041 | 30.4957 | 59.2998 |
| Model mNlogyVectorize2 | 29.4740 | 31.2363 | 60.7103 |
| Model mNlogy with \(\sigma \sim \text{HalfCauchy}(0,25)\) | 67.4086 | 76.5003 | 143.9089 |
| Model mNlogy with \(\sigma \sim \text{HalfCauchy}(0,25)\) | 74.6366 | 71.0961 | 145.7327 |
| Model mNlogySigmaPrior with \(\sigma \sim \text{Exponential}(1)\) | 69.9210 | 75.9105 | 145.8315 |
| Model mNlogySigmaPrior with \(\sigma \sim \text{Exponential}(1)\) | 70.7925 | 82.7382 | 153.5307 |
For the subsequent models, we will use the more informative priors of mNlogyBetaAlphaPrior.
Adaptive regularization
Links:
We are going to use adaptive regularization: the provenance intercepts share information through a common distribution whose parameters are learned from the data. See P357 of McElreath (first version).
mmNlogyC (multi-level model with normal distribution and log(y) - centered parameterization)
\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(log(\mu_{i}),\sigma_{i})\\ \mu_{i} & = \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + \alpha_{PROV[p]}\\ \alpha_{PROV} & \sim \mathcal{N}(\mu_{\alpha_{PROV}},\sigma_{\alpha_{PROV}})\\ \mu_{\alpha_{PROV}} & \sim \mathcal{N}(0,1)\\ \sigma_{\alpha_{PROV}}& \sim \text{Exponential}(1)\\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]Mathematical model
# Reduced data list for the one-varying-intercept models (no block effect).
data.list.reduced <- list(N = nrow(data),               # Number of observations
                          y = log(data$height),         # log(response variable)
                          age = data$age.sc,            # Tree age (scaled)
                          nprov = nlevels(data$prov),   # Number of provenances
                          prov = as.numeric(data$prov)) # Provenance index per tree
# Compile and fit mmNlogyC: multilevel model, centered parameterization.
mmNlogyC <- stan_model("mmNlogyC.stan")
fit.mmNlogyC <- sampling(mmNlogyC, data = data.list.reduced , iter = 2000, chains = 2,
cores = 2, control=list(max_treedepth=14), save_warmup = F)
## Warning: There were 2 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
# Posterior summary (medians, 95% CI, R-hat, ESS) for model mmNlogyC.
broom::tidyMCMC(fit.mmNlogyC,
                pars = c("beta_age", "beta_age2", "mean_alpha_prov", "sigma_alpha_prov",
                         "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "One-varying intercept model mmNlogyC with centered parameterization")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 2.9019767 | 0.3052398 | 2.3184176 | 3.4895899 | 1.008593 | 223 |
| beta_age2 | -0.7748249 | 0.1468177 | -1.0640999 | -0.4943766 | 1.008222 | 230 |
| mean_alpha_prov | 3.7900529 | 0.1500195 | 3.4973060 | 4.0789278 | 1.009119 | 227 |
| sigma_alpha_prov | 0.0606109 | 0.0584825 | 0.0172239 | 0.2093643 | 1.001321 | 491 |
| sigma_y | 0.3998205 | 0.0097379 | 0.3813280 | 0.4185636 | 1.002163 | 820 |
| lp__ | 362.6685007 | 2.5988537 | 356.5332759 | 366.4661911 | 1.002579 | 513 |
This model has some divergent transitions, low number of effective sample size and R-hat not exactly equal to one.
Divergent transitions
adapt_delta. McElreath (Second version): “[…] the target acceptance rate is controlled by the adapt_delta control parameter. The default is 0.95, which means that it aims to attain a 95% acceptance rate. It tries this during the warmup phase, adjusting the step size of each leapfrog step (go back to Chapter 9 if these terms aren’t familiar). When adapt_delta is set high, it results in a smaller step size, which means a more accurate approximation of the curved surface. It also means more computation, which means a slower chain. Increasing adapt_delta can often, but not always, help with divergent transitions.”
You can also try to add some information in your model, for instance with \(\beta_{age} \sim \text{LogNormal}(0,1)\).
Or you can reparameterize your model with the non-centered parameterization. (Best option!!)
# Posterior predictive check for mmNlogyC, on the log scale.
yrep_mmC <- as.matrix(fit.mmNlogyC, pars = "y_rep")[1:50, ]
ppc_dens_overlay(y = log(data$height), yrep_mmC) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))
# Trace plots of a provenance intercept and its scale parameter,
# with NUTS diagnostics (divergences) overlaid.
np_c <- nuts_params(fit.mmNlogyC)
mcmc_trace(as.array(fit.mmNlogyC),
           pars = c("alpha_prov[3]", "sigma_alpha_prov"),
           np = np_c) +
  xlab("Post-warmup iteration")
# Pairs plot with divergent transitions highlighted, to diagnose sampling
# problems (e.g. funnel geometry between sigma_alpha_prov and the intercepts).
mcmc_pairs(as.array(fit.mmNlogyC), np = nuts_params(fit.mmNlogyC),
pars = c("beta_age","beta_age2","mean_alpha_prov","sigma_alpha_prov","alpha_prov[3]","sigma_y"),
off_diag_args = list(size = 1, alpha = 1/3),
np_style = pairs_style_np(div_size=2, div_shape = 19))
Links:
From McElreath, P429 (13.4.2.) of Statistical Rethinking (second version)
\[ \alpha \sim \mathcal{N}(\mu,\sigma)\]
is equivalent to
\[\begin{equation} \begin{aligned} \alpha &= \mu + \beta\\ \beta &\sim \mathcal{N}(0,\sigma) \end{aligned} \end{equation}\]is equivalent to
\[\begin{equation} \begin{aligned} \alpha &= \mu + z\sigma\\ z &\sim \mathcal{N}(0,1) \end{aligned} \end{equation}\]No parameters are left inside the prior.
From Updating: A Set of Bayesian Notes. Jeffrey B. Arnold. 20 Multilevel Models
These are two ways of writing the same model. However, they change the parameters that the HMC algorithm is actively sampling and thus can have different sampling performance.
However, neither is universally better.
And there is currently no ex-ante way to know which will work better, and at what amount of “data” that the performance of one or the other is better. However, one other reason to use the centered parameterization (if it is also scaled), is that the Stan HMC implementation tends to be more efficient if all parameters are on the unit scale.
mmNlogyNC (multi-level model with normal distribution and log(y) - non-centered parameterization)
\[\begin{equation} \begin{aligned} \text{log}(h_{i}) & \sim \mathcal{N}(log(\mu_{i}),\sigma_{i})\\ \mu_{i} & = \alpha + \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} + z_{PROV[p]}\sigma_{PROV}\\ \alpha & \sim \mathcal{N}(0,1)\\ z_{PROV} & \sim \mathcal{N}(0,1)\\ \sigma_{PROV}& \sim \text{Exponential}(1)\\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]Mathematical model
# Compile and fit mmNlogyNC: multilevel model, non-centered parameterization.
mmNlogyNC <- stan_model("mmNlogyNC.stan")
fit.mmNlogyNC <- sampling(mmNlogyNC, data = data.list.reduced, iter = 2000, chains = 2,
cores = 2, control=list(max_treedepth=14), save_warmup = F)
# Posterior summary (medians, 95% CI, R-hat, ESS) for model mmNlogyNC.
broom::tidyMCMC(fit.mmNlogyNC,
                pars = c("beta_age", "beta_age2", "z_prov", "sigma_prov","alpha",
                         "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "One-varying intercept model mmNlogyNC with non-centered parameterization")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 2.8963848 | 0.2936762 | 2.3219892 | 3.4556964 | 0.9997297 | 707 |
| beta_age2 | -0.7719118 | 0.1417713 | -1.0441907 | -0.4943003 | 0.9996088 | 721 |
| z_prov[1] | 0.8382786 | 0.6527829 | -0.3638923 | 2.2019258 | 1.0002800 | 959 |
| z_prov[2] | 0.1255077 | 0.6335012 | -1.1256549 | 1.3308042 | 0.9992908 | 1033 |
| z_prov[3] | -1.0181842 | 0.7390431 | -2.5026089 | 0.4347973 | 1.0011316 | 771 |
| z_prov[4] | 0.1321020 | 0.6979406 | -1.3108607 | 1.4895910 | 0.9992205 | 1224 |
| z_prov[5] | 0.2273153 | 0.6506613 | -1.0936794 | 1.5297297 | 0.9995888 | 1093 |
| sigma_prov | 0.0604593 | 0.0679792 | 0.0147948 | 0.2267167 | 1.0027545 | 434 |
| alpha | 3.7902642 | 0.1495662 | 3.4877374 | 4.0659309 | 0.9995612 | 659 |
| sigma_y | 0.4000639 | 0.0099042 | 0.3811835 | 0.4203267 | 0.9991321 | 1643 |
| lp__ | 348.5608194 | 2.9013131 | 341.8144068 | 352.9438544 | 1.0041025 | 412 |
No more divergent transitions! But still low effective sample size.
# Posterior predictive check for mmNlogyNC, on the log scale.
yrep_mmNC <- as.matrix(fit.mmNlogyNC, pars = "y_rep")[1:50, ]
ppc_dens_overlay(y = log(data$height), yrep_mmNC) +
  theme_bw() +
  theme(legend.text = element_text(size = 25), legend.title = element_text(size = 18),
        axis.text = element_text(size = 18), legend.position = c(0.8, 0.6))
# Trace plots of the standardized provenance effect and its scale,
# with NUTS diagnostics overlaid.
np_nc <- nuts_params(fit.mmNlogyNC)
mcmc_trace(as.array(fit.mmNlogyNC),
           pars = c("z_prov[3]", "sigma_prov"),
           np = np_nc) +
  xlab("Post-warmup iteration")
## No divergences to plot.
# Pairs plot with divergences highlighted, to check that the non-centered
# parameterization removed the problematic geometry seen in mmNlogyC.
mcmc_pairs(as.array(fit.mmNlogyNC), np = nuts_params(fit.mmNlogyNC),
pars = c("beta_age","beta_age2","alpha","sigma_prov","z_prov[3]","sigma_y"),
off_diag_args = list(size = 1, alpha = 1/3),
np_style = pairs_style_np(div_size=3, div_shape = 19))
For a lognormal distribution the non-centered parameterisation is different:
\[\alpha \sim \mathcal{logN}(log(\mu),\sigma)\]
is equivalent to
\[\begin{equation} \begin{aligned} \alpha &= e^{log(\mu) + z\sigma} \\ z &\sim \mathcal{N}(0,1) \end{aligned} \end{equation}\]mmLogNnc (multi-level LogNormal distribution - non-centered parameterization)
\[\begin{equation} \begin{aligned} h_{i} & \sim \text{LogNormal}(log(\mu_{i}),\sigma_{i})\\ \mu_{i} & = e^{log(\alpha) + z_{PROV[p]}\sigma_{PROV}} + \beta_{age}age_{i} + \beta_{age2}age^{2}_{i} \\ \alpha & \sim \text{LogNormal}(0,1) \\ \beta_{age} & \sim \mathcal{N}(0,1) \\ \beta_{age2} & \sim \mathcal{N}(0,1)\\ z_{PROV} & \sim \mathcal{N}(0,1)\\ \sigma_{PROV} & \sim \text{Exponential}(1)\\ \sigma & \sim \text{Exponential}(1) \end{aligned} \end{equation}\]Mathematical model
# Data for the multi-level lognormal model: response is on the raw height
# scale (the lognormal likelihood handles the log internally).
data.list.reduced.logN <- list(N = length(data$height),            # Number of observations
                               y = data$height,                    # Response variable (raw heights)
                               age = data$age.sc,                  # Scaled tree age
                               nprov = length(unique(data$prov)),  # Number of provenances
                               prov = as.numeric(data$prov))       # Provenance index per tree
# Compile and fit the non-centered lognormal model.
# FALSE is spelled out (not `F`) because `F` is a reassignable binding.
mmLogNnc <- stan_model("mmLogNnc.stan")
fit.mmLogNnc <- sampling(mmLogNnc, data = data.list.reduced.logN, iter = 2000, chains = 2,
                         cores = 2, control = list(max_treedepth = 14), save_warmup = FALSE)
## Warning: There were 16 divergent transitions after warmup. Increasing adapt_delta above 0.8 may help. See
## http://mc-stan.org/misc/warnings.html#divergent-transitions-after-warmup
## Warning: Examine the pairs() plot to diagnose sampling problems
## Warning: Tail Effective Samples Size (ESS) is too low, indicating posterior variances and tail quantiles may be unreliable.
## Running the chains for more iterations may help. See
## http://mc-stan.org/misc/warnings.html#tail-ess
# Posterior medians, 95% intervals, R-hat, and ESS for the lognormal
# non-centered fit. TRUE is spelled out (not `T`) because `T` is reassignable.
broom::tidyMCMC(fit.mmLogNnc,
                pars = c("beta_age", "beta_age2", "z_prov", "sigma_prov", "alpha",
                         "sigma_y", "lp__"),
                droppars = NULL, estimate.method = "median",
                ess = TRUE, rhat = TRUE, conf.int = TRUE) %>%
  kable(caption = "One-varying intercept model mmLogNnc with Lognormal non-centered parameterization")
| term | estimate | std.error | conf.low | conf.high | rhat | ess |
|---|---|---|---|---|---|---|
| beta_age | 1.0893722 | 1.0166708 | -0.8637837 | 3.1690709 | 0.9991699 | 1714 |
| beta_age2 | 2.2029369 | 0.9547520 | 0.3538261 | 4.1541712 | 1.0008084 | 1847 |
| z_prov[1] | 0.6858172 | 0.7519873 | -0.8962745 | 2.1977663 | 1.0005479 | 486 |
| z_prov[2] | -0.2466287 | 0.7600026 | -1.7962695 | 1.3257914 | 1.0020855 | 318 |
| z_prov[3] | -0.4903074 | 0.8381313 | -2.1682095 | 1.1721589 | 1.0072173 | 527 |
| z_prov[4] | 0.2150199 | 0.7882349 | -1.4275055 | 1.7074164 | 1.0001439 | 870 |
| z_prov[5] | 0.3250498 | 0.7586772 | -1.2573239 | 1.6897075 | 1.0028006 | 506 |
| sigma_prov | 0.0505327 | 0.0533628 | 0.0030357 | 0.2043873 | 1.0014523 | 433 |
| alpha | 314.2389761 | 13.7904298 | 277.6893436 | 335.2116801 | 1.0107708 | 114 |
| sigma_y | 0.5777587 | 0.0136868 | 0.5525291 | 0.6051340 | 0.9992581 | 1949 |
| lp__ | 15.9954343 | 2.7988806 | 9.9019425 | 20.6543876 | 1.0020345 | 502 |
# Posterior predictive check for the lognormal model: replicates are compared
# to the observed heights on the raw scale.
ppc_dens_overlay(y = data$height,
                 as.matrix(fit.mmLogNnc, pars = "y_rep")[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
# Trace plots for the slow-mixing parameters of the lognormal fit.
mcmc_trace(as.array(fit.mmLogNnc),
           pars = c("z_prov[3]", "sigma_prov"),
           np = nuts_params(fit.mmLogNnc)) +
  xlab("Post-warmup iteration")
# Pairs plot for the lognormal fit; divergences are highlighted as large
# filled points to locate where in parameter space they occur.
mcmc_pairs(as.array(fit.mmLogNnc), np = nuts_params(fit.mmLogNnc),
           pars = c("beta_age", "beta_age2", "alpha",
                    "sigma_prov", "z_prov[3]", "sigma_y"),
           off_diag_args = list(size = 1, alpha = 1/3),
           np_style = pairs_style_np(div_size = 3, div_shape = 19))
# Data for the two-varying-intercept model (provenance + block).
data.list_mod2_2 <- list(N = length(data$height),             # Number of observations
                         y = data$height,                     # Response variable
                         age = data$age.sc,                   # Scaled tree age
                         nprov = length(unique(data$prov)),   # Number of provenances
                         nblock = length(unique(data$block)), # Number of blocks
                         prov = as.numeric(data$prov),        # Provenance index
                         bloc = as.numeric(data$block))       # Block index ("bloc" matches the Stan data name)
# Use `<-` (not `=`) for top-level assignment, per R convention.
mod2_2 <- stan_model("mod2_2.stan")
fit.mod2_2 <- sampling(mod2_2, data = data.list_mod2_2, iter = 2000, chains = 2,
                       cores = 2, control = list(max_treedepth = 14))
# Posterior summary (10/50/90% quantiles) for mod2_2.
print(fit.mod2_2,
      pars = c("beta_age", "beta_age2",
               "alpha_prov", "alpha_block",
               "mean_alpha_prov", "sigma_alpha_prov",
               "mean_alpha_block", "sigma_alpha_block",
               "sigma_y"),
      probs = c(0.10, 0.5, 0.9))
# Posterior predictive check against observed heights (50 replicates).
y_rep <- as.matrix(fit.mod2_2, pars = "y_rep")
ppc_dens_overlay(y = data$height, y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
McElreath: “[…] note that there is only one global mean parameter \(\alpha\), and both of the varying intercept parameters are centered at zero. We can’t identify a separate mean for each varying intercept type, because both intercepts are added to the same linear prediction. So it is conventional to define varying intercepts with a mean of zero, so there’s no risk of accidentally creating hard-to-identify parameters.” “If you do include a mean for each cluster type, it won’t be the end of the world, however.”
# Same two-intercept model with alternative priors.
# Use `<-` (not `=`) for top-level assignment, per R convention.
mod2_2_otherpriors <- stan_model("mod2_2_otherpriors.stan")
fit.mod2_2_otherpriors <- sampling(mod2_2_otherpriors, data = data.list_mod2_2,
                                   iter = 2000, chains = 2, cores = 2,
                                   control = list(max_treedepth = 14))
print(fit.mod2_2_otherpriors,
      pars = c("beta_age", "beta_age2",
               "alpha_prov", "alpha_block",
               "mean_alpha_prov", "sigma_alpha_prov",
               "mean_alpha_block", "sigma_alpha_block",
               "sigma_y"),
      probs = c(0.10, 0.5, 0.9))
# mod2_3: single global intercept alpha with zero-centered varying intercepts.
# Use `<-` (not `=`) for top-level assignment, per R convention.
mod2_3 <- stan_model("mod2_3.stan")
fit.mod2_3 <- sampling(mod2_3, data = data.list_mod2_2, iter = 2000, chains = 2,
                       cores = 2, control = list(max_treedepth = 14))
print(fit.mod2_3,
      pars = c("beta_age", "beta_age2",
               "alpha",
               "alpha_prov", "alpha_block",
               "sigma_alpha_prov", "sigma_alpha_block",
               "sigma_y"),
      probs = c(0.10, 0.5, 0.9))
# Posterior predictive check against observed heights (50 replicates).
y_rep <- as.matrix(fit.mod2_3, pars = "y_rep")
ppc_dens_overlay(y = data$height, y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
# mod2_3 with alternative (more informative) priors.
# Use `<-` (not `=`) for top-level assignment, per R convention.
mod2_3_otherpriors <- stan_model("mod2_3_otherpriors.stan")
fit.mod2_3_otherpriors <- sampling(mod2_3_otherpriors, data = data.list_mod2_2,
                                   iter = 2000, chains = 2, cores = 2,
                                   control = list(max_treedepth = 14))
print(fit.mod2_3_otherpriors,
      pars = c("beta_age", "beta_age2",
               "alpha",
               "alpha_prov", "alpha_block",
               "sigma_alpha_prov", "sigma_alpha_block",
               "sigma_y"),
      probs = c(0.10, 0.5, 0.9))
\(\alpha\) is very different between the two models!
With more iterations:
# Refit mod2_3_otherpriors with 3000 iterations per chain to improve ESS.
fit.mod2_3_otherpriors_iter3000 <- sampling(mod2_3_otherpriors, data = data.list_mod2_2,
                                            iter = 3000, chains = 2, cores = 2,
                                            control = list(max_treedepth = 14))
print(fit.mod2_3_otherpriors_iter3000,
      pars = c("beta_age", "beta_age2",
               "alpha",
               "alpha_prov", "alpha_block",
               "sigma_alpha_prov", "sigma_alpha_block",
               "sigma_y"),
      probs = c(0.10, 0.5, 0.9))
# mod2_4: non-centered parameterization (z_prov / z_block standardized offsets).
# Use `<-` (not `=`) for top-level assignment, per R convention.
mod2_4 <- stan_model("mod2_4.stan")
fit.mod2_4 <- sampling(mod2_4, data = data.list_mod2_2, iter = 2000, chains = 2,
                       cores = 2, control = list(max_treedepth = 14))
print(fit.mod2_4,
      pars = c("beta_age", "beta_age2",
               "alpha",
               "z_prov", "z_block",
               "sigma_prov", "sigma_block",
               "sigma_y"),
      probs = c(0.10, 0.5, 0.9))
# Posterior predictive check against observed heights (50 replicates).
y_rep <- as.matrix(fit.mod2_4, pars = "y_rep")
ppc_dens_overlay(y = data$height, y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
# Trace plots with divergence markers for the hierarchical scale parameters.
posterior_cp <- as.array(fit.mod2_4)
np_cp <- nuts_params(fit.mod2_4)
mcmc_trace(posterior_cp, pars = c("alpha", "sigma_block", "sigma_prov", "z_block[3]"),
           np = np_cp) +
  xlab("Post-warmup iteration")
# mod2_4 with alternative priors.
# Use `<-` (not `=`) for top-level assignment, per R convention.
mod2_4_otherpriors <- stan_model("mod2_4_otherpriors.stan")
fit.mod2_4_otherpriors <- sampling(mod2_4_otherpriors, data = data.list_mod2_2,
                                   iter = 2000, chains = 2, cores = 2,
                                   control = list(max_treedepth = 14))
print(fit.mod2_4_otherpriors,
      pars = c("beta_age", "beta_age2",
               "alpha",
               "z_prov", "z_block",
               "sigma_prov", "sigma_block",
               "sigma_y"),
      probs = c(0.10, 0.5, 0.9))
# Posterior predictive check against observed heights (50 replicates).
y_rep <- as.matrix(fit.mod2_4_otherpriors, pars = "y_rep")
ppc_dens_overlay(y = data$height, y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
# Trace plots with divergence markers for the hierarchical scale parameters.
posterior_cp <- as.array(fit.mod2_4_otherpriors)
np_cp <- nuts_params(fit.mod2_4_otherpriors)
mcmc_trace(posterior_cp, pars = c("alpha", "sigma_block", "sigma_prov", "z_block[3]"),
           np = np_cp) +
  xlab("Post-warmup iteration")
Let’s increase the target acceptance (adapt_delta=0.99)
McElreath (second edition): “[…] the target acceptance rate is controlled by the adapt_delta control parameter. The default is 0.95, which means that it aims to attain a 95% acceptance rate. It tries this during the warmup phase, adjusting the step size of each leapfrog step (go back to Chapter 9 if these terms aren’t familiar). When adapt_delta is set high, it results in a smaller step size, which means a more accurate approximation of the curved surface. It also means more computation, which means a slower chain. Increasing adapt_delta can often, but not always, help with divergent transitions.”
# Refit with a higher target acceptance rate (adapt_delta = 0.99) to shrink
# the leapfrog step size and reduce divergent transitions.
fit.mod2_4_otherpriors_adaptdelta <- sampling(mod2_4_otherpriors, data = data.list_mod2_2,
                                              iter = 2000, chains = 2, cores = 2,
                                              control = list(max_treedepth = 14,
                                                             adapt_delta = 0.99))
print(fit.mod2_4_otherpriors_adaptdelta,
      pars = c("beta_age", "beta_age2",
               "alpha",
               "z_prov", "z_block",
               "sigma_prov", "sigma_block",
               "sigma_y"),
      probs = c(0.10, 0.5, 0.9))
Longer to run, but it’s OK now! The estimates are also similar to model mod2_3_otherpriors (the model with centered parameterization and more informative priors).
# Data list for the varying-slope models (mod3_*).
# NOTE(review): this list has the same contents as data.list_mod2_2 defined
# earlier — consider reusing that object instead of duplicating it.
data.list_mod3 <- list(N=length(data$height), # Number of observations
y=data$height, # Response variables
age=data$age.sc, # Tree age
nprov=length(unique(data$prov)), # Number of provenances
nblock=length(unique(data$block)), # Number of blocks
prov=as.numeric(data$prov), # Provenances
bloc=as.numeric(data$block)) # Blocks
mod3_1: In this model, I followed the example from here: Stan code of Statistical Rethinking, 13.3 Example: cross-classified chimpanzees with varying slopes.
Even with a 99% acceptance rate (`adapt_delta=0.99`) and 3000 iterations, the model had some divergent transitions and small effective sample sizes.
One thing I didn’t understand with this model code is: where are LKJ and \(\sigma_{\alpha_{BLOCK}}\) priors?
# mod3_1: cross-classified varying-intercept/varying-slope model (after
# Statistical Rethinking 13.3). Use `<-` (not `=`) for top-level assignment.
mod3_1 <- stan_model("mod3_1.stan")
fit.mod3_1 <- sampling(mod3_1, data = data.list_mod3, iter = 3000, chains = 2,
                       cores = 2, control = list(max_treedepth = 14, adapt_delta = 0.99))
print(fit.mod3_1, probs = c(0.10, 0.5, 0.9))
# Posterior predictive check against observed heights (50 replicates).
y_rep <- as.matrix(fit.mod3_1, pars = "y_rep")
ppc_dens_overlay(y = data$height, y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
Let’s call the correlation matrix \(\mathbf{R}\):
\[\mathbf{R} = \begin{pmatrix} 1 & \rho \\ \rho & 1 \end{pmatrix}\] If the prior was:
\[\mathbf{R} \sim \text{LKJcorr(2)}\]
There were more divergent transitions.
If the prior was:
\[\mathbf{R} \sim \text{LKJcorr(4)}\]
There were fewer divergent transitions; see the model below.
McElreath: “So whatever is the LKJcorr distribution? What LKJcorr(2) does is define a weakly informative prior on \(\rho\) that is skeptical of extreme correlations near −1 or 1. You can think of it as a regularizing prior for correlations. This distribution has a single parameter, \(\eta\), that controls how skeptical the prior is of large correlations in the matrix. When we use LKJcorr(1), the prior is flat over all valid correlation matrices. When the value is greater than 1, such as the 2 we used above, then extreme correlations are less likely. To visualize this family of priors, it will help to sample random matrices from it and plot the distribution of correlations”
# Sample 10,000 2x2 correlation matrices from LKJ priors with eta = 1, 2, 4
# to visualize how larger eta shrinks the prior away from correlations near -1/1.
Rho2 <- rlkjcorr( 1e4 , K=2 , eta=2 )
Rho1 <- rlkjcorr( 1e4 , K=2 , eta=1 )
Rho4 <- rlkjcorr( 1e4 , K=2 , eta=4 )
# NOTE(review): rethinking::dens() draws base graphics rather than returning a
# ggplot object, while cowplot::plot_grid() expects ggplots/grobs — confirm
# this grid actually renders three panels as intended.
plot_grid(dens(Rho1[,1,2],xlim=c(-1,1),xlab="correlation" ,ylim=c(0,1.2),main="eta=1"),
dens(Rho2[,1,2] , xlab="correlation" ,xlim=c(-1,1),ylim=c(0,1.2),main="eta=2"),
dens(Rho4[,1,2],xlim=c(-1,1),xlab="correlation" ,ylim=c(0,1.2),main="eta=4"))
mod3_2: Model with \(\mathbf{R} \sim \text{LKJcorr(4)}\)
# mod3_2: same varying-slope model with an LKJcorr(4) prior on the correlation
# matrix. Use `<-` (not `=`) for top-level assignment, per R convention.
mod3_2 <- stan_model("mod3_2.stan")
fit.mod3_2 <- sampling(mod3_2, data = data.list_mod3, iter = 3000, chains = 2,
                       cores = 2, control = list(max_treedepth = 14, adapt_delta = 0.99))
print(fit.mod3_2,
      pars = c("beta_age", "beta_age2",
               "alpha", "alpha_block", "sigma_block",
               "Rho_prov", "sigma_prov", "alpha_prov", "beta_prov", "v_prov", "SRS_prov",
               "sigma_y"),
      probs = c(0.10, 0.5, 0.9))
# Posterior predictive check against observed heights (50 replicates).
y_rep <- as.matrix(fit.mod3_2, pars = "y_rep")
ppc_dens_overlay(y = data$height, y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
Statistical rethinking (first version):
Model m13_6NC (p. 405).
Stan code here: Code model using non-centered parameterization.
# mod3_3: non-centered parameterization of the varying-slope model.
# Use `<-` (not `=`) for top-level assignment, per R convention.
mod3_3 <- stan_model("mod3_3.stan")
fit.mod3_3 <- sampling(mod3_3, data = data.list_mod3, iter = 3000, chains = 2,
                       cores = 2, control = list(max_treedepth = 14, adapt_delta = 0.99))
print(fit.mod3_3,
      pars = c("beta_age", "beta_age2",
               "alpha", "z_alpha_block", "sigma_block",
               "Rho_prov", "sigma_prov", "z_alpha_prov", "z_beta_prov", "v_prov",
               "sigma_y"),
      probs = c(0.10, 0.5, 0.9))
# Posterior predictive check against observed heights (50 replicates).
y_rep <- as.matrix(fit.mod3_3, pars = "y_rep")
ppc_dens_overlay(y = data$height, y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
# Trace and pairs diagnostics with divergence markers.
posterior_cp <- as.array(fit.mod3_3)
np_cp <- nuts_params(fit.mod3_3)
mcmc_trace(posterior_cp, pars = c("alpha", "sigma_prov[1]"), np = np_cp) +
  xlab("Post-warmup iteration")
mcmc_pairs(posterior_cp, np = np_cp,
           pars = c("alpha", "beta_age", "beta_age2", "sigma_y",
                    "sigma_prov[1]", "sigma_prov[2]"),
           off_diag_args = list(size = 1, alpha = 1/3),
           np_style = pairs_style_np(div_size = 3, div_shape = 19))
Comment: I tried \(\alpha \sim \mathcal{N}(0,1)\), chains didn’t mix. Very high R-hat values and lots of divergent transitions.
Sorensen et al. 2016. Listing 8.
# Data list for the Sorensen et al. (2016, Listing 8) model.
# NOTE(review): identical content to data.list_mod3 above — could be reused.
data.list_mod3_4 <- list(N = length(data$height),             # Number of observations
                         y = data$height,                     # Response variable
                         age = data$age.sc,                   # Scaled tree age
                         nprov = length(unique(data$prov)),   # Number of provenances
                         nblock = length(unique(data$block)), # Number of blocks
                         prov = as.numeric(data$prov),        # Provenance index
                         bloc = as.numeric(data$block))       # Block index
# Use `<-` (not `=`) for top-level assignment, per R convention.
mod3_4 <- stan_model("mod3_4.stan")
fit.mod3_4 <- sampling(mod3_4, data = data.list_mod3_4, iter = 2000, chains = 2,
                       cores = 2, control = list(max_treedepth = 14, adapt_delta = 0.999))
print(fit.mod3_4, probs = c(0.10, 0.5, 0.9))
# Posterior predictive check against observed heights (50 replicates).
y_rep <- as.matrix(fit.mod3_4, pars = "y_rep")
ppc_dens_overlay(y = data$height, y_rep[1:50, ]) +
  theme_bw() +
  theme(legend.text = element_text(size = 25),
        legend.title = element_text(size = 18),
        axis.text = element_text(size = 18),
        legend.position = c(0.8, 0.6))
Comment: \(\alpha\) has very different values between the McElreath and Sorensen models.